# Computations
import pandas as pd
import numpy as np
# sklearn
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
# KMeans
from sklearn.cluster import KMeans
# preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# keras
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD, Adagrad
from keras.utils.vis_utils import plot_model
import keras.backend as K
# Text
from colorama import Fore, Back, Style
# Visualisation libraries
import seaborn as sns
import matplotlib.pyplot as plt
from yellowbrick.features.pcoords import parallel_coordinates
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
# Graphics in retina format (IPython magic — only works inside a notebook)
%config InlineBackend.figure_format = 'retina'
# sns setting: paper context with slightly enlarged title/label fonts
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")
# plt setting
# NOTE(review): the 'seaborn-whitegrid' style name was renamed to
# 'seaborn-v0_8-whitegrid' in matplotlib 3.6 — confirm the pinned version.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
# Silence library warnings to keep notebook output readable.
warnings.filterwarnings("ignore")
In this article, we analyze a weather dataset from Kaggle.com. Data description from Kaggle:
# Load the daily weather observations; the 'number' row-index column carries
# no information, so it is dropped immediately.
Data = pd.read_csv('weatherdata/daily_weather.csv').drop(columns=['number'])
Data.head().style.hide_index().set_precision(2)
| Columns | Description |
|---|---|
| Air Pressure | Air pressure in hectopascals (100 pascals) at 9 AM |
| Air Temperature | Air temperature in degrees Fahrenheit at 9 AM |
| Avg Wind Direction | Average wind direction over the minute before the timestamp in degrees (0 starts from the north) at 9 AM |
| Avg Wind Speed | Average wind speed over the minute before the timestamp in meter per seconds (m/s) at 9 AM |
| Max Wind Direction | Highest wind direction in the minute before the timestamp in degrees (0 starts from the north) at 9 AM |
| Max Wind Speed | Highest wind speed in the minute before the timestamp in meter per seconds (m/s) at 9 AM |
| Min Wind Speed | Smallest wind speed in the minute before the timestamp in meter per seconds (m/s) at 9 AM |
| Rain Accumulation | Accumulated rain in millimeters (mm) at 9 AM |
| Rain Duration | Length of time it rained, in seconds (s), at 9 AM |
| Relative Humidity (Morning) | Relative humidity in percentage at 9 AM |
| Relative Humidity (Afternoon) | Relative humidity in percentage at 3 PM |
For convenience, we would like to modify the feature names.
def _tidy_column(name):
    """Return a human-friendly version of a raw weather column name."""
    name = name.replace('ty_9am', 'ty_(Morning)').replace('3pm', '(Afternoon)')
    name = name.replace('_9am', '').replace('_', ' ')
    return name.title().replace('Temp', 'Temperature')

Data.columns = [_tidy_column(column) for column in Data.columns]
Data.head(5).style.hide_index().set_precision(2)
def Data_info(Inp, Only_NaN = False):
    """Summarize each column of *Inp*: dtype, NaN count, and NaN percentage.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame to inspect.
    Only_NaN : bool, default False
        When True, keep only columns that contain at least one NaN.

    Returns
    -------
    pandas.DataFrame indexed by column name, sorted by dtype, with columns
    'Data Type', 'Number of NaN Values', and 'Percentage' (rounded to 2 dp).
    """
    nan_counts = Inp.isnull().sum().to_frame(name='Number of NaN Values')
    Out = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    Out = Out.join(nan_counts, how='outer')
    # Percentage of rows missing, relative to the full frame length.
    Out['Percentage'] = (100 * Out['Number of NaN Values'] / Inp.shape[0]).round(2)
    if Only_NaN:
        Out = Out[Out['Number of NaN Values'] > 0]
    return Out
# Find the columns that still contain NaNs and impute them with column means.
nan_summary = Data_info(Data, Only_NaN = True)
display(nan_summary)
nan_columns = nan_summary.index.tolist()
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
Data[nan_columns] = imp.fit_transform(Data[nan_columns])
# Re-check: no column should report missing values any more.
Data_info(Data)
Let's set Relative Humidity (Afternoon) as the target variable. This means that, given the dataset and using the rest of the features, we would like to know whether it is humid or not at 3 PM. In doing so, we define a Humidity Level (Afternoon) feature as follows:
# Number of equal-frequency humidity classes.
N = 4
Target = 'Humidity Level (Afternoon)'
# Bin the continuous humidity into N quantile classes labelled 0..N-1.
# CONSISTENCY FIX: use N for q= and labels= instead of the hard-coded 4,
# so changing N in one place keeps qcut and Range_dict in agreement.
Data[Target], bins = pd.qcut(Data['Relative Humidity (Afternoon)'], precision =2, retbins= True, q=N, labels=np.arange(N))
# Drop the raw target column; df keeps the features plus the binned label.
df = Data.drop(columns = ['Relative Humidity (Afternoon)'])
# Map each class label to a human-readable "(low, high]" interval string.
Range_dict = dict(enumerate('(%.2f, %.2f]' % (bins[i], bins[i+1]) for i in range(N)))
del bins
Next, let's look at the variance of our dataset features.
# Feature variances (target column excluded), largest first.
raw_variance = df.iloc[:, :-1].var().sort_values(ascending=False).to_frame(name='Variance')
display(raw_variance.style.background_gradient(cmap='OrRd').set_precision(2))
Furthermore, we would like to standardize features by removing the mean and scaling to unit variance. In this article, we demonstrated the benefits of scaling data using StandardScaler().
# Standardize every feature to zero mean / unit variance; the target label
# in the last column is left untouched.
scaler = StandardScaler()
df.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1])
# After scaling, every feature variance should be ~1.
scaled_variance = df.iloc[:, :-1].var().sort_values(ascending=False).to_frame(name='Variance')
display(scaled_variance.style.background_gradient(cmap=sns.light_palette("green", as_cmap=True)).set_precision(2))
df.describe().style.set_precision(2)
def Correlation_Plot (Df,Fig_Size):
    """Draw a lower-triangle correlation heatmap of *Df*.

    Parameters
    ----------
    Df : pandas.DataFrame
        Data whose pairwise Pearson correlations are plotted.
    Fig_Size : int or float
        Side length (inches) of the square figure.
    """
    Correlation_Matrix = Df.corr().round(2)
    # IDIOM FIX: mask only the strict upper triangle (k=1) in one step,
    # instead of masking the full triangle and then un-masking the diagonal
    # in a loop. A bool mask also avoids the implicit float-as-mask usage.
    mask = np.zeros_like(Correlation_Matrix, dtype=bool)
    mask[np.triu_indices_from(mask, k=1)] = True
    Fig, ax = plt.subplots(figsize=(Fig_Size,Fig_Size))
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True,
                cmap =sns.color_palette("Greens", n_colors=10), linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": .6})
Correlation_Plot (df, 8)
Let's visualize the data using a Parallel Coordinates plot.
# Parallel-coordinates view of the standardized features, coloured by the
# binned humidity class (legend labels come from Range_dict's intervals).
X = df.drop(columns = [Target])
y = df[Target]
# One fixed colour per humidity-level class (N = 4 classes).
C = ["#3498db", "#e74c3c", "#34495e", "#2ecc71"]
fig, axes = plt.subplots(nrows=1, ncols=1, figsize = (15, 8))
# sample=0.05 with shuffle: draw a random 5% of rows to keep the plot legible.
visualizer = parallel_coordinates(X, y, ax=axes, classes=[Range_dict[i] for i in range(len(Range_dict))],
features= X.columns.tolist(),
colors = C,
# colors = sns.color_palette("bright", N),
normalize='standard', sample=0.05, shuffle=True)
del X, y
However, the results of this visualization can be improved if a clustering method is used. For this reason, we apply the K-Means clustering method.
# Cluster the standardized features into N groups and plot the N centroids
# as parallel coordinates (one line per centroid).
kmeans = KMeans(n_clusters = N)
Temp = df.drop(columns = Target)
model = kmeans.fit(Temp)
# BUG FIX: label the centroid columns with the feature columns actually fed
# to KMeans (df minus Target). The original used df.iloc[:, 1:], which
# dropped the FIRST feature and included Target — every centroid column was
# shifted by one name, and the Out[Target] assignment below then clobbered
# the last feature's centroid values.
Out = pd.DataFrame(model.cluster_centers_, columns = Temp.columns.tolist())
# NOTE(review): this pairs cluster rows with sorted class labels positionally;
# KMeans cluster ids carry no inherent order — confirm intended semantics.
Out[Target] = np.sort(df[Target].unique().tolist())
Out.style.hide_index()
Temp = Out.copy()
# Replace the numeric class label with its "(low, high]" interval string.
Temp['Humidity Level (Afternoon)'] = Temp['Humidity Level (Afternoon)'].map(Range_dict)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize = (16, 8))
_ = pd.plotting.parallel_coordinates(Temp, 'Humidity Level (Afternoon)', lw = 2, color = C, ax = ax)
_ = ax.set_ylim([-3, +10])
_ = ax.set_xticklabels(labels = Temp.columns.tolist(), rotation= 45)
_ = ax.legend(title = 'Humidity Level (Afternoon)', loc = 'upper left', fontsize = 13)
# Build the supervised learning matrices; y is one-hot encoded for the
# categorical-crossentropy loss used below.
# NOTE(review): X is taken from the unscaled `Data` (not the standardized
# `df`), and it still contains 'Relative Humidity (Afternoon)' — the very
# column the Target labels were derived from via qcut. That looks like
# target leakage; confirm whether this is intentional.
X = Data.drop(columns = [Target])
y = pd.get_dummies(Data[Target]).astype(int)
# 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).set_index('Set').T
Here, we implement an artificial neural network (ANN) using Keras.
# Feed-forward classifier: 3 hidden layers, one-hot output of y.shape[1] units.
# API FIX: `init=` and `nb_epoch=` are Keras 1.x keyword names that were
# removed in Keras 2; the `'accuracy'` history key read further below only
# exists in Keras >= 2.3, so the modern spellings `kernel_initializer=` and
# `epochs=` are required for this notebook to run under any single version.
model = Sequential()
model.add(Dense(12, input_dim= X.shape[1], kernel_initializer='uniform', activation='relu'))
model.add(Dense(10, kernel_initializer='uniform', activation='sigmoid'))
model.add(Dense(4, kernel_initializer='uniform', activation='sigmoid'))
# NOTE(review): a sigmoid output with categorical_crossentropy is unusual —
# softmax is the conventional pairing; kept as-is to preserve behavior.
model.add(Dense(y.shape[1], kernel_initializer='uniform', activation='sigmoid'))
# Number of training epochs
IT = int(1e3)+1
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy','mae', 'mse'])
# Train model (verbose=0: silent; history is inspected afterwards instead)
history = model.fit(X_train, y_train, epochs= IT, batch_size=50, verbose=0)
# Predictions and score on the held-out test set
y_pred = model.predict(X_test)
score = model.evaluate(X_test, y_test)
score = pd.DataFrame(score, index = model.metrics_names).T
history = pd.DataFrame(history.history)
display(score.style.hide_index())
# Training-history curves: one trace per tracked metric on a white canvas
# with light-gray grids and mirrored axis lines.
metric_traces = [
    ('loss', 'Loss', 'OrangeRed'),
    ('accuracy', 'Accuracy', 'MidnightBlue'),
    ('mae', 'Mean Absolute Error (MAE)', 'ForestGreen'),
    ('mse', 'Mean Squared Error (MSE)', 'purple'),
]
fig = go.Figure()
epoch_axis = history.index.values
for column, trace_name, trace_color in metric_traces:
    fig.add_trace(go.Scatter(x=epoch_axis, y=history[column].values,
                             line=dict(color=trace_color, width=1.5),
                             name=trace_name))
fig.update_layout(legend=dict(y=0.5, traceorder='reversed', font_size=12))
fig.update_layout(dragmode='select', plot_bgcolor='white', height=600, hovermode='closest')
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig['layout']['xaxis'].update(range=[0, epoch_axis.max()])
fig['layout']['yaxis'].update(range=[0, 1.6])
fig.show()
Finally, a summary and a glimpse of the model.
# Architecture overview: layer output shapes and parameter counts.
model.summary()
# NOTE(review): plot_model needs pydot + graphviz installed, and the
# keras.utils.vis_utils import path moved in later Keras versions — confirm
# against the pinned Keras version.
plot_model(model, show_shapes=True, show_layer_names=True, expand_nested = True)